1. Introduction

The purpose of this vignette is to document the process of removing outliers in the data set COS in the environment of R with the aim at exploring different accuraccies that we can achieve in the classification for the year 2017. As part of the supervised classification, besides the imagery for 2017, we account with a training dataset called COS (Carta de uso e Ocupacao do solo) composed by 13000 samples categorized by different LCLU types (1000 samples each type). Since our training dataset differs in date and source of adquisition respect the imagery (labels done by interpretation of aerial imagery), we need to trace the differences between what the labels explain and what actually happens in the ground. In this sense, since all samples are not equally informative for each image, we propose as first attempt to play with the ndvi values, so that we can make asumptions of which samples are going to be used in one image and another not.

Before moving to the methodology, I only want to highlight the land cover types for this study: Holm and Cork_Trees, Herbaceous_permanet, Herbaceous periodic, Bushes and shrubs, Non vegetated, Wetlands, Sealed, Water, Rice_fields, Coniferous_trees, Eucalyptus_trees.

2. Methodology

We recreated the NDVI for 29 images available for the year 2017 an extract the NDVI values to the 12000 points. Since We expect certain homogeneity in the NDVI values per class and per time, we think that the set of those values that do not follow this trend are probably points that may change for natural or anthropogenic reasons; or simply the label is not enough representation of the area covered by the pixel.

Therefore, to measure the statistical dispersion of NDVI per class and per time we are going to use two methods, interquantile ranges and central means.

library(sf)
<<<<<<< HEAD
## Linking to GEOS 3.7.0, GDAL 2.2.3, PROJ 4.9.3
## WARNING: different compile-time and runtime versions for GEOS found:
## Linked against: 3.7.0-CAPI-1.11.0 673b9939 compiled against: 3.6.2-CAPI-1.10.2
## It is probably a good idea to reinstall sf, and maybe rgeos and rgdal too
library(ggplot2)
folder_path = "/home/user/Documents/TESISMASTER/VECTOR/Training_ndvi/training_samples3.shp"
#Days of the images
day_shoot = c("2017/01/15", "2017/04/05", "2017/05/25", "2017/06/14","2017/07/29",
              "2017/08/13", "2017/09/12", "2017/10/12", "2017/11/16","2017/12/16")
=======
## Linking to GEOS 3.6.2, GDAL 2.2.3, PROJ 4.9.3
library(ggplot2)
folder_path = "/home/user/Documents/TESISMASTER/VECTOR/Training_ndvi/training_samples2.shp"
#Days of the images
day_shoot = c("2017/01/15", "2017/04/05", "2017/05/25", "2017/06/04", "2017/06/14",
              "2017/07/04", "2017/07/09", "2017/07/14", "2017/07/24", "2017/07/29",
              "2017/08/03", "2017/08/08", "2017/08/13", "2017/08/18", "2017/09/02" ,
              "2017/09/07", "2017/09/12", "2017/09/22", "2017/09/27", "2017/10/02",
              "2017/10/12", "2017/10/22", "2017/10/27", "2017/11/11", "2017/11/16",
              "2017/11/21", "2017/12/01","2017/12/16", "2017/12/21")
>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

#converting days characters to time format in R 
day_shoot= strptime(as.character(day_shoot), "%Y/%m/%d")
#reading shapefile
dataset = sf::read_sf(folder_path)

2.1 Method 1: interquantile range analysis

IQR method makes detection of ouliers by looking at values more than one and half times the IQR distance below the first quartil and above the third quartil. IQR analysis is attractive because it considers the median as the measurement of centrality and thus is considered resistant to outliers. The following equations shows how to calculate this range per time for especific land cover type.

\[ IQR^{t} = Q_{3}^{t} - Q_{1}^{t} \] \[ Limits^{t} = Q_{1}^{t} \pm 1.5*IQR^{t} \]

model1 =  function(x, min_x, max_x){
  ind_t = which(x< min_x | x > max_x)
  if(length(ind_t)!=0){
    x[ind_t] = NA  
  }
  qua = quantile(x,na.rm=TRUE)
  q25 =qua[2]
  q75 =qua[4]
  IQR = q75 - q25
  limit_left = q25 - 1.5*IQR
  limit_rigth = q75 + 1.5*IQR
  ind = which(x<= limit_left | x >= limit_rigth | is.na(x))
  return(ind)
}

2.2 Method 2: Central means

And alternative to interquantile range analysis is to explore the values that are at different standard deviations from the mean of the data (here two standard deviations). Standard deviation analysis is also a measurement of dispersion. However, it is problematic for not being resistant to the presence of outliers and, furthermore, it works under the assumption that data must follow certain normality.

\[ Limits^{t} = \bar{x}^{t} \pm n \cdot \sigma ^{t} \]

model2 =  function(x, min_x, max_x){
  ind_t = which(x< min_x | x > max_x)
  if(length(ind_t)!=0){
    x[ind_t] = NA  
  }
  sd_x = sd(x,na.rm = TRUE)
  mean_x = mean(x,na.rm = TRUE)
  limit_left = mean_x - 1.5*sd_x
  limit_rigth = mean_x + 1.5*sd_x
  #filter according with the threshold and central tendency criterium
  ind = which(x < limit_left | x > limit_rigth | is.na(x))
  return(ind)
}

function only for transformation of the data in a suitable form in my analysis

model0 =  function(x, min_x, max_x){
  ind_t = which(x< min_x | x > max_x)
  if(length(ind_t)!=0){
    x[ind_t] = NA  
  }
  #filter according with the threshold
  ind = which(is.na(x))
  return(ind)
}

2.3 Implementation

Well, basically the next function apply the previous methods and return a suitable dataframe for the use of ggplot2 library.

#This function remove the possible outliers of the data set, considering variability through the time.
removing_outliers = function(datast,
                               methodo = c("model_0","model_1", "model_2"), day_shoot,pivot = "CLASS_NAME"){
  #selecting model
  list_out = vector("list",2)
  func_choice <- switch(methodo,
                       'model_0'=model0,
                       'model_1'=model1,
                       'model_2'=model2
                       )
  names_dt = colnames(datast) 
  ind_pivot = which(names_dt == pivot)
  #keeping only name of the pivot
  names_dt2 = c(names_dt[ind_pivot],names_dt[3:length(names_dt)])
  #subsetting columns
  data_st = datast[,names_dt2]
  colnames(data_st) = c("class",names_dt[3:length(names_dt)])
  funct_order = function(x, ind) ind[x]
  st_geometry(data_st) = NULL
  output = vector("list",ncol(data_st)-1)
  names(output) = colnames(data_st[,-1])
  #loop methodology over different classes
  for(i in unique(data_st$class)){
    index = which(data_st$class == i)
    df = data_st[index,-1]
    list_ind = lapply(df,func_choice, min_x = -1, max_x = 1)
    list_ind2 = lapply(list_ind, funct_order, ind= index)
    for(j in 1:length(list_ind2)){
      output[[j]] = c(list_ind2[[j]], output[[j]])
    }
  }
  #creating new data frame
  list_df = vector("list", length(output))
  names(list_df) = names(output)
  for(k in 1:length(output)){
    nu = output[[k]]
    if(length(nu) == 0){
      list_df[[k]] = datast[,c(pivot,names(output)[k])]
    }else{
      list_df[[k]] = datast[-nu,c(pivot,names(output)[k])]
    }
  }
  names(list_df) = day_shoot
  return(list_df) 
}  
  

#============================
#adapting results to ggplot
#============================

dataframe_ggplot = function(x){
  #selecting column
  selecting_column = function(x, class_number){
    names(x)[class_number] = "selection"
    return(x$selection)
  }
  #building data frame
  df_class = lapply(x, selecting_column, 1)
  df_ndvi = lapply(x, selecting_column, 2)
  column_class = reshape2::melt(df_class)
  df_gp = cbind(column_class, unlist(df_ndvi)) 
  colnames(df_gp) = c("class","time","NDVI")
  return(df_gp)
}


#fitting a data frame with ndvi columns to one data frame suitable to be plot for ggplot
func_median = function(x) median(x, na.rm = TRUE)
func = function(x) x
trajectories_boxplot = function(df,day_shoot){
  st_geometry(df) = NULL
  df_names = colnames(df)
  colnames(df) = c("category",df_names[-1])
  output= NULL
  for(i in unique(df$category)){
    index = which(df$category == i)
    df_list = lapply(df[index,-1], func)
    names(df_list) = day_shoot
    df0 = reshape2::melt(df_list)
    df1 = cbind(df0,i)
    output = rbind(output , df1)
  }
  colnames(output) = c("NDVI","time","class")
  return(output)
}

3. Results

I will go through all the classes analysing the ndvi values per image in order to set which samples must be out of the training data.

3.1 Forest

In the following graphic we can see the dispersion of ndvi for three different classes of trees per image over the year 2017. The classes of Holm and Cork trees, and Eucaliptus depict similar dispersion and median in terms of NDVI. Instead, Coniferous trees depict lower dispersion and a lower ranges of median of NDVI values. Besides that, at the begining of he year, all three classes have precense of a high amount of apparently anomaly data. This samples with ndvi ranges lower than 0.3 can be mixtures of the orginal class with shrubs of natural vegetation. Threfore, it may be good aidea to remove this samples in order to account with samples as pure as possible in the classification.

library(reshape2)
query_dataset_trees = dataset[dataset$CLASS_NAME %in% c('Holm_and_Cork_Trees',
                                                   'Eucalyptus_trees',
                                                   'Coniferous_trees') 
                                                   ,c(-1)]

trajectories_trees  = trajectories_boxplot(query_dataset_trees, day_shoot)
trajectories_trees$class = factor(trajectories_trees$class,  labels = c('Coniferous_trees' ,'Eucalyptus_trees','Holm_and_Cork_Trees'))

ggplot(trajectories_trees, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

=======

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

3.1.1 Interquantile range

After implementing the first technique over the data of trees, we can see a new escenario where Eucaliptus and Coniferous trees show similar dispersion and trend. However, now, the Holm and Cork trees looks to move downward and show a lower range of ndvi values in comparition with the other two classes; this behaviour can obey to the fact that Holm and cork trees tend to be plant with certain spatial regularity, and thus depict lower amount of vegetation per area. In this sense, NDVI values are a sort of indicatior of biomass for these three species.

query_dataset_trees = dataset[dataset$CLASS_NAME %in% c('Holm_and_Cork_Trees',
                                                   'Eucalyptus_trees',
                                                   'Coniferous_trees')
                                                   ,]
trees_m1 = removing_outliers(query_dataset_trees, methodo = c("model_1"),day_shoot)
df_trees = dataframe_ggplot(trees_m1)
ggplot(df_trees, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

=======

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

3.1.2 Central means

The attempt of removing outliers by implementing standard deviation lead to have signals apparantly more pure. It is expectable that ndvi values of trees are higher than 0.3, so that this attempt remove most part of the sampling that can obey to a mixture of dwarf trees with other classes of vegetation than turn out not representative labels.

Moreover, this attempt imply to sacrifice more information than the previous technique, so that, we must be awere of how much information we are wealling to loss during this preprocessin stage. For example, using the previos technique we only get rid of a little amount of data that eventually dont represent more than the 2% of information. However, considering 1.5 standard deviiations the average of information out range between 7% and 15%.

trees_m2 = removing_outliers(query_dataset_trees, methodo = c("model_2"),day_shoot)
df_trees = dataframe_ggplot(trees_m2)
ggplot(df_trees, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot()
<<<<<<< HEAD

=======

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

3.1.3 Removing NDVI values lower than 0.3 from classes of trees

This additional stage of preprocessing is mainly with the goal of removing any additional sample with ndvi value lower than 0.3 from the training set of trees.

remove_values_forest = function(x){
  colnames(x) = c("CLASS_NAME","NDVI","geometry")
  ind1 = which(x$NDVI >= 0.3)
  x1 = x[ind1,]
  return(x1)
}
#interquanile range
query_trees_m1  = lapply(trees_m1, remove_values_forest)
#central means
query_trees_m2 = lapply(trees_m2, remove_values_forest)

df_trees_plot = dataframe_ggplot(query_trees_m2)
ggplot(df_trees_plot, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot()
<<<<<<< HEAD

=======

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

3.2 Bushes and shrubs

USing the same wrokflow for trees, we implement the same for Bushes and shrubs; except for the last part were we constrained aour data of trees to certain range of ndvi values.

library(reshape2)
bushes = dataset[dataset$CLASS_NAME %in% c("Bushes_and_shrubs"),c(-1)]
trajectories_bushes  = trajectories_boxplot(bushes, day_shoot)
ggplot(trajectories_bushes, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot()
<<<<<<< HEAD

=======

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

3.2.1 Interquantile range

query_bushes = dataset[dataset$CLASS_NAME %in% c("Bushes_and_shrubs") ,]
query_bushes_m1 = removing_outliers(query_bushes, methodo = c("model_1"),day_shoot)
df_query_bushes_m1 = dataframe_ggplot(query_bushes_m1)
ggplot(df_query_bushes_m1, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

=======

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

3.2.2 Central means

query_bushes_m2 = removing_outliers(query_bushes, methodo = c("model_2"),day_shoot)
df_query_bushes_m2 = dataframe_ggplot(query_bushes_m2)
ggplot(df_query_bushes_m2, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot()
<<<<<<< HEAD

=======

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
len_x = function(x){
  table(x$CLASS_NAME)
}
#lapply(query_bushes_m2, len_x)

As it was in the preprocessing of trees, the second technique take a range of 7% and 15% of the information out in comparition to the first techniqe thah only imply to take 2% of the information out.

<<<<<<< HEAD

3.3 Herbaceous

We have two classes called ‘Herbacous permanent’ or always green and ‘herbacous periodic’. The former class is characterized for being a temporal class, so that, I will query samples with vegetation using ndvi values larger than 0.3. Therefore, I will start with herbaceous vegetation implementing the usual tecniques so far used in this document.

3.3.1 Herbaceous Permanent

=======

3.3 Rice fields

The sowing and harvesting of rice in central of portugal differs from one place to another. HOwever, in most of the cases the sowing is done during spring. That is, months of Febrary and April. Instead, the time for harvesting the rice rangeS between September and NOvember. Accordign with the graphic of ndvi values over the year, this calendar makes a lot of sense, since the highest ndvi values belong to the period of mid season and the lowest to the sowing and harvesting. Besides that, there are several samples wich ndvi values range lower than cero, this happens during the first stage of the cycle rice production where the crops are flooded and thus response as waterlands.

ricefields = dataset[dataset$CLASS_NAME %in% c('Rice_fields') ,c(-1)]

trajectories_ricefields  = trajectories_boxplot(ricefields, day_shoot)

ggplot(trajectories_ricefields, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.3.1 Interquantile range

After implementing this first technique, we can see that large part of what believe is anomaly data using NDVI is out. Normaly this technique did not have a huge impact of removing data in the previous classes. However, in this opportunity this technique takes in average 7% of the information out. Although the number of imformation out in this opportuniy increased, there are still ndvi values that range in a lower value of cero. Therefore, in order to evoid confusion with labels of wetlands, I will remove them.

query_ricefields = dataset[dataset$CLASS_NAME %in% c('Rice_fields') ,]

ricefields_m1 = removing_outliers(query_ricefields , methodo = c("model_1"),day_shoot)
df_ricefields_m1 = dataframe_ggplot(ricefields_m1)
ggplot(df_ricefields_m1 , aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

3.3.2 Central means

With central means I avoid the process of removing ndvi values lower than cero manually. Under this technique the number of samples out range within 7 and 15%.

ricefields_m2 = removing_outliers(query_ricefields , methodo = c("model_2"),day_shoot)
df_ricefields_m2 = dataframe_ggplot(ricefields_m2)
ggplot(df_ricefields_m2 , aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 

#lapply(ricefields_m1, len_x)

3.4 Herbaceous

We have two classes called ‘Herbacous permanent’ or always green and ‘herbacous periodic’. The former class is characterized for being a temporal class, so that, I will query samples with vegetation using ndvi values larger than 0.3. Therefore, I will start with herbaceous vegetation implementing the usual tecniques so far used in this document.

3.4.1 Herbaceous Permanent

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
herbaceous_permanent = dataset[dataset$CLASS_NAME %in% c('Herbaceous_permanet') ,c(-1)]

trajectories_herbaceous_permanent= trajectories_boxplot(herbaceous_permanent, day_shoot)

ggplot(trajectories_herbaceous_permanent, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

3.3.1.1 Interquantile range

=======

3.4.1.1 Interquantile range

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
query_herbaceous_permanent = dataset[dataset$CLASS_NAME %in% c('Herbaceous_permanet') ,]

herbaceous_permanent_m1 = removing_outliers(query_herbaceous_permanent , methodo = c("model_1"),day_shoot)
df_herbaceous_permanent_m1 = dataframe_ggplot(herbaceous_permanent_m1)
ggplot(df_herbaceous_permanent_m1 , aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

3.3.1.2 Central means

=======

3.4.1.2 Central means

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
herbaceous_permanent_m2 = removing_outliers(query_herbaceous_permanent , methodo = c("model_2"),day_shoot)
df_herbaceous_permanent_m2 = dataframe_ggplot(herbaceous_permanent_m2)
ggplot(df_herbaceous_permanent_m2 , aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

3.3.2 Herbaceous Periodic with vegetation

=======

3.4.2 Herbaceous Periodic with vegetation

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

In this opportunity I am not going to clean data using any of the proposed techniques. Instead, I will trace those values lower than 0.3 in herbacesous periodic to take them out and thus only consider a class with vegetation.

herbaceous_periodic = dataset[dataset$CLASS_NAME %in% c("Herbaceous_periodic") 
                                                   ,c(-2)]
trajectories_herbaceous_periodic  = trajectories_boxplot(herbaceous_periodic, day_shoot)
ggplot(trajectories_herbaceous_periodic, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot()
<<<<<<< HEAD

=======

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
herbaceous_periodic = dataset[dataset$CLASS_NAME %in% c("Herbaceous_periodic") ,]

herbaceous_periodic_m0 = removing_outliers(herbaceous_periodic , methodo = c("model_0"),day_shoot)
#function that keeps samples with ndvi values larger than cero
vegetation = function(x){
  colnames(x) = c("CLASS_NAME","NDVI","geometry")
  ind = which(x$NDVI >= 0.3)
  return(x[ind,])
}

herbaceous_periodic_w_vegetation = lapply(herbaceous_periodic_m0, vegetation)

One proposal, may be to ensamble those samples with ndvi values lower than 0.3 in the class of not vegetated. HOwever, I prefer to work with pure labels for this task.

<<<<<<< HEAD

3.3.4 Selecting samples to create class herbaceous

=======

3.4.4 Selecting samples to create class herbaceous

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

From the static modelling perspective, herbacous periodic with vegetation and herbaceous permanent are merged in order to work with inly one class. THerefore,I am going to select randomly 500 samples per class with vegetation and create the class herbceous.

query_random = function(x){
    colnames(x) = c("CLASS_NAME","NDVI","geometry")
    if(nrow(x) >= 500){
      ind_x = sample(1:nrow(x),500)
      x$CLASS_NAME = "Herbaceous"
      return(x[ind_x,])
    }
      x$CLASS_NAME = "Herbaceous"
      return(x)
}
herbaceous_1 = lapply(herbaceous_permanent_m2 , query_random)
herbaceous_2 = lapply(herbaceous_periodic_w_vegetation ,query_random)
<<<<<<< HEAD

3.4 Non - vegetated

=======

3.5 Non - vegetated

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

Non-vegetated class tend to depict near infrared reflectance somehow larger than the red. This sligh difference turns out in low positive values of NDVI. According with the literature (cite) we may have this classs in the range of 0 and 0.3 ndvi. Let’s have a look of the NDVI signal over the year 2017 and evaluate what percentage of the information meets this criterion.

query_dataset_no_vegetated = dataset[dataset$CLASS_NAME %in% c("Non_vegetated") 
                                                   ,]
table_nv = table(query_dataset_no_vegetated$DESCRICAO)
df_nv = data.frame(table_nv, pos= as.numeric(table_nv)*0.5)
ggplot(data=df_nv , aes(x=Var1, y=Freq)) +
  coord_flip() +
  geom_bar(stat="identity", fill=c("firebrick2","steelblue","olivedrab4")) +
  geom_text(aes(label = Freq, y= pos))

query_dataset_no_vegetated = dataset[dataset$CLASS_NAME %in% c("Non_vegetated") 
                                                   ,c(-2)]
trajectories_no_vegetated  = trajectories_boxplot(query_dataset_no_vegetated , day_shoot)
ggplot(trajectories_no_vegetated, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

### 3.4.1 Interquantile range

=======

### 3.5.1 Interquantile range

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

since sparse vegetation and bare rocks depic ndvi values larger than cero I will take them out of the sampling.

query_dataset_no_vegetated = dataset[dataset$CLASS_NAME %in% c("Non_vegetated") ,]
non_vegetated_m1 = removing_outliers(query_dataset_no_vegetated , methodo = c("model_1"),day_shoot,pivot = "CLASS_NAME")
df_non_vegetated_m1 = dataframe_ggplot(non_vegetated_m1)
ggplot(df_non_vegetated_m1, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot()
<<<<<<< HEAD

### 3.4.2 central means

=======

### 3.5.2 central means

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
non_vegetated_m2 = removing_outliers(query_dataset_no_vegetated , methodo = c("model_2"),day_shoot,pivot = "CLASS_NAME")
df_non_vegetated_m2 = dataframe_ggplot(non_vegetated_m2)
ggplot(df_non_vegetated_m2, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

3.5 Water

=======

3.6 Water

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

Class water is composed for several categories, I want to explore how many samples we have per class using as reference 1000 we have.

dataset_water = dataset[dataset$CLASS_NAME == "Water" ,] 
table_water = table(dataset_water$DESCRICAO)
df_water = data.frame(table_water, pos= as.numeric(table_water)*0.5)
ggplot(data=df_water , aes(x=Var1, y=Freq)) +
  coord_flip() +
  geom_bar(stat="identity", fill="steelblue") +
  geom_text(aes(label = Freq, y= pos))

<<<<<<< HEAD

3.5.1 Interquantile range

=======

3.6.1 Interquantile range

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
query_dataset_water = dataset[dataset$CLASS_NAME %in% c("Water"),]
water_m1 = removing_outliers(query_dataset_water , methodo = c("model_1"),day_shoot)
df_water_m1 = dataframe_ggplot(water_m1)
ggplot(df_water_m1, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

3.5.2 central means

=======

3.6.2 central means

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
water_m2 = removing_outliers(query_dataset_water , methodo = c("model_2"),day_shoot)
df_water_m2 = dataframe_ggplot(water_m2)
ggplot(df_water_m2, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

=======

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
#lapply(water_m2, len_x)
<<<<<<< HEAD

3.6 Sealed

=======

3.7 Sealed

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

Since the spectral response defined by the pixel obeys to the predominant land cover type around the point label and not necessarily to the type which the point is representing, it is advisable to get rid of those points of sealead where I pressume there is mixture with vegetation.

Here class sealed is composed for different categories. Let’s have a look of how many samples we have per category.

dataset_sealed = dataset[dataset$CLASS_NAME == "Sealed" , ]
table_sealed = table(dataset_sealed$DESCRICAO)
df_sealed = data.frame(table_sealed , pos= as.numeric(table_sealed)*0.5)
ggplot(data=df_sealed , aes(x=Var1, y=Freq)) +
  coord_flip() +
  geom_bar(stat="identity", fill="steelblue") +
  geom_text(aes(label = Freq, y= pos))

let’s see the trajectories

dataset_sealed = dataset[dataset$CLASS_NAME == 'Sealed',-1]

trajectories_sealed  = trajectories_boxplot(dataset_sealed , day_shoot)
ggplot(trajectories_sealed, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

3.6.1 Interquantile range

=======

3.7.1 Interquantile range

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
dataset_sealed = dataset[dataset$CLASS_NAME == 'Sealed',]
query_dataset_sealed = dataset_sealed[!(dataset_sealed$DESCRICAO %in% c("1.1.2.01.1 Tecido urbano descontínuo",
                                                        "1.1.2.02.1 Tecido urbano descontínuo esparso",
                                                        "1.2.1.03.1 Instalações agrícolas"
                                                        )),]
sealed_m1 = removing_outliers(query_dataset_sealed , methodo = c("model_1"),day_shoot)
df_sealed_m1 = dataframe_ggplot(sealed_m1)
ggplot(df_sealed_m1, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

3.6.2 Central means

=======

3.7.2 Central means

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
sealed_m2 = removing_outliers(query_dataset_sealed , methodo = c("model_2"),day_shoot)
df_sealed_m2 = dataframe_ggplot(sealed_m2)
ggplot(df_sealed_m2, aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

3.7 Wetlands

3.7.1 Central means

=======

3.8 Wetlands

3.8.1 Central means

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
dataset_wetlands = dataset[dataset$CLASS_NAME == "Wetlands",]
wetlands_m2 = removing_outliers(dataset_wetlands , methodo = c("model_2"),day_shoot)
df_wetlands_m2 = dataframe_ggplot(wetlands_m2)
ggplot(df_wetlands_m2 , aes(x=time, y=NDVI, fill=class)) + 
   theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 12),legend.position="bottom",legend.text=element_text(size=12)) +
    geom_boxplot() 
<<<<<<< HEAD

=======

>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

4. Multitemporal data interquantile range analysis

<<<<<<< HEAD
datafinal = list(query_trees_m2, query_bushes_m2, 
=======
datafinal = list(query_trees_m2, query_bushes_m2, ricefields_m2, 
>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
                 herbaceous_1, herbaceous_2, non_vegetated_m2,
                 water_m2, sealed_m2, wetlands_m2)

out = NULL
merge_list_class = function(x){
  for(i in 1:length(x)){
    df0 = cbind(names(x[i]),x[[i]])
    colnames(df0) = c("time","CLASS_NAME","NDVI", "geometry")
    out = rbind(df0, out)
  }
  return(out)
}

merge1 = lapply(datafinal,merge_list_class)
outfinal= NULL
for(i in 1:length(merge1)){
    outfinal = rbind(merge1[[i]],outfinal)
}

graphic

conteo_df = function(x){
  return(table(x$CLASS_NAME))
}

listdata = split(outfinal,outfinal$time)
conteo_list = lapply(listdata,conteo_df)
out_conteo = NULL
for(k in 1:length(conteo_list)){
  df0 = data.frame(conteo_list[[k]],time = names(conteo_list[k]))
  out_conteo = rbind(df0,out_conteo)
}
#class_names
factor_classes = unique(out_conteo$Var1)

#sum per class
fun_sum = function(x) sum(x$Freq)
conteo_total = lapply(split(out_conteo,out_conteo$time),fun_sum)
conteo_total = melt(conteo_total)

#defining position of the text
out_conteo$halffreq = out_conteo$Freq *0.5
out_conteo$pos = 1
for(i in unique(out_conteo$time)){
  ind1 = which(out_conteo$time == i)
  out1 = out_conteo[ind1,]
  out0 = 0
  id_conteo = which(conteo_total$L1 == i)
  for(j in factor_classes){
    ind2 = which(out1$Var1 == j)
    out0 = out0 + out1[ind2,"Freq"]
    out_conteo[ind1[ind2],"pos"] = conteo_total[id_conteo,"value"]  - (out0 - out_conteo[ind1[ind2],"halffreq"])
  }
}

ggplot graphic

out_conteo$Class = factor(out_conteo$Var1, factor_classes)
#colors in the graphic
<<<<<<< HEAD
colors_classes = c("#31670a","#e70f25", "#6ddb42", "#fbff39", "#f48002", "#8505ca", "gray", "#2335b9","#0f99de")
=======
colors_classes = c("green4","gold1", "green3", "green1", "springgreen3", "springgreen1", "darkolivegreen3", "turquoise","plum2","dodgerblue2")
>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f
names(colors_classes) = factor_classes

ggplot(data=out_conteo, aes(x=time, y=Freq, fill= Class)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, size= 13), legend.text=element_text(size=15)) +
  geom_bar(stat="identity") +
  scale_fill_manual("Classes", values = colors_classes) +
<<<<<<< HEAD
  geom_text(aes(label = Freq,y = pos),size = 4)

#total number of data per image
func_sum_samples = function(x){sum(x$Freq)}
as.data.frame(lapply(split(out_conteo,out_conteo$time),func_sum_samples))
##   X2017.01.15 X2017.04.05 X2017.05.25 X2017.06.14 X2017.07.29 X2017.08.13
## 1        7698        7629        7461        7418        7194        7215
##   X2017.09.12 X2017.10.12 X2017.11.16 X2017.12.16
## 1        7165        7132        7639        7612
======= geom_text(aes(label = Freq,y = pos),size = 6)

#total number of data per image
func_sum_samples = function(x){sum(x$Freq)}
as.data.frame(lapply(split(out_conteo,out_conteo$time),func_sum_samples))
##   X2017.01.15 X2017.04.05 X2017.05.25 X2017.06.04 X2017.06.14 X2017.07.04
## 1        8570        8519        8334        8454        8291        8370
##   X2017.07.09 X2017.07.14 X2017.07.24 X2017.07.29 X2017.08.03 X2017.08.08
## 1        8152        8272        8172        8068        8179        8182
##   X2017.08.13 X2017.08.18 X2017.09.02 X2017.09.07 X2017.09.12 X2017.09.22
## 1        8084        8041        8272        8243        8019        8066
##   X2017.09.27 X2017.10.02 X2017.10.12 X2017.10.22 X2017.10.27 X2017.11.11
## 1        8096        7998        8024        8310        8238        8560
##   X2017.11.16 X2017.11.21 X2017.12.01 X2017.12.16 X2017.12.21
## 1        8547        8532        8518        8498        8498
>>>>>>> ac173c5f72d856c3a48e1b2c234e6bde1e2aa60f

exporting some results

#folder = '/home/user/Documents/TESISMASTER/VECTOR/training_data_static_model_centralmeans/'

#for(k in unique(outfinal$time)){
#  ind_t = which(outfinal$time == k)
#  create_folder = paste0(folder,as.character(k))
#  dir.create(create_folder)
#  sf::write_sf(outfinal[ind_t,c("CLASS_NAME")], paste0(create_folder,"/training_samples_cm.shp"))
#}